{% extends 'base.html' %}
{% block mainpage %}import pandas as pd
from datetime import datetime, timedelta
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.lines as lines
import seaborn as sns
%matplotlib inline
Now read the turnstile data for July 2015 that was produced in the data preparation step.
# Read subway turnstile data.
# NOTE(review): the file name suggests the range 2015-07-01 to 2015-07-31
# (YYMMDD_YYMMDD) - confirm against the data preparation step.
Jul_subway_data = pd.read_csv('NYCT150701_150731.csv')
# Define function to get date and time information
def Parse_Time(data):
    """Replace the ``TIME`` column of *data* with calendar columns.

    ``TIME`` is parsed with the format ``%Y/%m/%d %H:%M:%S`` and then dropped.
    Four columns are added in its place:

    * ``Month``   - abbreviated month name, e.g. ``'Jul'``
    * ``Day``     - day of the month
    * ``Hour``    - hour of the day (0-23)
    * ``Weekday`` - abbreviated day name, ``'Mon'`` ... ``'Sun'``
      (Thursday is spelled ``'Thur'``)

    The frame is modified in place and the same object is returned.
    """
    weekday_names = {0: 'Mon', 1: 'Tue', 2: 'Wed', 3: 'Thur', 4: 'Fri',
                     5: 'Sat', 6: 'Sun'}
    month_names = {1: 'Jan', 2: 'Feb', 3: 'Mar', 4: 'Apr', 5: 'May', 6: 'Jun',
                   7: 'Jul', 8: 'Aug', 9: 'Sep', 10: 'Oct', 11: 'Nov',
                   12: 'Dec'}

    timestamps = pd.to_datetime(data['TIME'], format='%Y/%m/%d %H:%M:%S')

    # The .dt accessor returns Series that share the frame's own index, so the
    # new columns stay aligned even when the index is not 0..n-1 (building
    # fresh Series from Python lists would misalign and yield NaN there).
    data['Month'] = timestamps.dt.month.map(month_names)
    data['Day'] = timestamps.dt.day
    data['Hour'] = timestamps.dt.hour
    data['Weekday'] = timestamps.dt.weekday.map(weekday_names)

    data.drop(['TIME'], axis=1, inplace=True)
    return data
# Swap the raw TIME column for Month / Day / Hour / Weekday columns.
Jul_subway_data = Parse_Time(Jul_subway_data)
# Further format the data into six time groups and subset to only weekday data
def group_time(x):
    """Return the four-hour bucket label for the hour of day *x*.

    The labels are ``'0 - 4'``, ``'4 - 8'``, ``'8 - 12'``, ``'12 - 16'``,
    ``'16 - 20'`` and ``'20 - 24'``. Each bucket includes its lower bound and
    excludes its upper bound.

    Returns ``None`` when *x* is not a valid hour, i.e. outside ``[0, 24)``
    or NaN.
    """
    # Written as a negated range test so that NaN (for which every comparison
    # is False) and negative values are rejected instead of being mislabelled.
    if not 0 <= x < 24:
        return None
    labels = ('0 - 4', '4 - 8', '8 - 12', '12 - 16', '16 - 20', '20 - 24')
    return labels[int(x // 4)]
# Label every row with the four-hour bucket that its Hour value falls into.
Jul_subway_data['time_group'] = Jul_subway_data['Hour'].apply(group_time)
def is_weekday(x):
    """Return True when *x* is one of the labels 'Mon' through 'Fri'.

    The comparison is exact, so Thursday must be spelled 'Thur'.
    """
    return x in ('Mon', 'Tue', 'Wed', 'Thur', 'Fri')
# Busyness of an interval = exits plus entries recorded in that interval.
Jul_subway_data['Busyness'] = Jul_subway_data['intvlExit'].add(Jul_subway_data['intvlEntry'])

# Total busyness per (day of month, time group), Monday-Friday rows only.
weekday_rows = Jul_subway_data[Jul_subway_data['Weekday'].apply(is_weekday)]
busyness_by_day = weekday_rows.groupby(['Day', 'time_group'])['Busyness'].sum()

# Put the time groups in chronological order within each day.
Jul_subway_weekday = busyness_by_day.reindex(
    ['0 - 4', '4 - 8', '8 - 12', '12 - 16', '16 - 20', '20 - 24'],
    level=1,
)
In the section above, I define the "busyness" of a station in a given time period as the sum of its entries and exits within that period. In the section below, I define the net flow of a station in a given time period as entries minus exits during that period, so a positive value means more people entered the station than left it.
# Net flow into a station = entries minus exits recorded in an interval
# (positive: more people entering than exiting).
Jul_subway_data['Net_flow_in'] = Jul_subway_data['intvlEntry'] - Jul_subway_data['intvlExit']

# Mean weekday net flow per (station, time group), with the time groups put
# in chronological order.
Jul_weekday_station_netflow = (
    Jul_subway_data[Jul_subway_data['Weekday'].apply(is_weekday)]
    .groupby(['STATION', 'time_group'])['Net_flow_in']
    .mean()
    .reindex(['0 - 4', '4 - 8', '8 - 12', '12 - 16', '16 - 20', '20 - 24'], level=1)
)

# Mean weekday busyness per (station, time group), same ordering.
Jul_weekday_station_busyness = (
    Jul_subway_data[Jul_subway_data['Weekday'].apply(is_weekday)]
    .groupby(['STATION', 'time_group'])['Busyness']
    .mean()
    .reindex(['0 - 4', '4 - 8', '8 - 12', '12 - 16', '16 - 20', '20 - 24'], level=1)
)

# Pivot to one row per station and one column per time group.
Jul_weekday_station_netflow = Jul_weekday_station_netflow.unstack()
Jul_weekday_station_busyness = Jul_weekday_station_busyness.unstack()

# One (latitude, longitude) pair per station, averaged over that station's rows.
# The columns are selected with a list: subsetting a GroupBy with a bare tuple
# of names is deprecated and rejected by current pandas.
nyc_station_location = pd.read_csv('turnstile_weather_v2.csv')
nyc_station_longlat = (
    nyc_station_location[['station', 'latitude', 'longitude']]
    .groupby(['station'])[['latitude', 'longitude']]
    .mean()
)

# Attach coordinates by station name. merge() defaults to an inner join, so
# stations missing from either table are dropped here.
Jul_weekday_station_netflow = Jul_weekday_station_netflow.merge(
    nyc_station_longlat, left_index=True, right_index=True)
Jul_weekday_station_busyness = Jul_weekday_station_busyness.merge(
    nyc_station_longlat, left_index=True, right_index=True)
def prepare_plot_data(x):
    """Assemble the per-station table plotted for time group *x*.

    *x* is a time-group column label such as ``'8 - 12'``. The result is
    indexed like the station tables and has the columns ``latitude``,
    ``longitude``, ``busyness`` and ``netflow``, where the last two are the
    *x* columns of ``Jul_weekday_station_busyness`` and
    ``Jul_weekday_station_netflow`` respectively.
    """
    # Give each side its final column name up front so the join needs no
    # suffix handling.
    busyness_side = Jul_weekday_station_busyness[['latitude', 'longitude', x]].rename(
        columns={x: 'busyness'})
    netflow_side = Jul_weekday_station_netflow[[x]].rename(columns={x: 'netflow'})
    return busyness_side.merge(netflow_side, left_index=True, right_index=True)
def transform_coordinates(series, old_origin, old_width, new_origin, new_width):
    """Linearly map *series* from one coordinate range onto another.

    A value equal to ``old_origin`` maps to ``new_origin`` and a value equal
    to ``old_origin + old_width`` maps to ``new_origin + new_width``. The four
    range arguments are converted with ``float()``; *series* may be a scalar
    or anything that supports element-wise arithmetic. A negative
    ``new_width`` reverses the axis direction.
    """
    src_start, src_span, dst_start, dst_span = (
        float(value)
        for value in (old_origin, old_width, new_origin, new_width)
    )
    # Position within the old range as a fraction, then scaled to the new one.
    fraction = (series - src_start) / src_span
    return dst_start + fraction * dst_span
Now we plot the net flow of each station for each time period of the day. The size of a marker represents the station's busyness and its color represents the station's net flow: red means more entries than exits, and blue means more exits than entries.
def spatial_plot(time_group, title, image_path='new-york.png'):
    """Draw the station map for one time group on top of a background image.

    Each station returned by ``prepare_plot_data(time_group)`` becomes one
    marker: its area is the station's busyness divided by 8 and its colour
    encodes the station's net flow on the 'seismic' colormap.

    Parameters:
        time_group: time-group column label passed to ``prepare_plot_data``,
            e.g. ``'8 - 12'``.
        title: figure title.
        image_path: background map image; defaults to ``'new-york.png'``.

    Returns:
        The ``matplotlib.pyplot`` module, with the new figure current.
    """
    img = plt.imread(image_path)
    # Only height and width are needed; slicing also accepts an image array
    # that has no channel axis.
    ypixels, xpixels = img.shape[:2]
    dpi = 72.
    xinch = xpixels / dpi
    yinch = ypixels / dpi
    fig = plt.figure(figsize=(xinch / 2.0, yinch * 0.5 / .85))

    plot_data = prepare_plot_data(time_group)

    # Convert degrees to pixel positions. NOTE(review): this assumes the image
    # covers 0.35 degrees starting at longitude -74.1 and latitude 40.55 -
    # confirm against the image. The y axis is reversed because pixel rows
    # count downwards.
    xpos = transform_coordinates(plot_data['longitude'], -74.1, .35, 0, xpixels)
    ypos = transform_coordinates(plot_data['latitude'], 40.55, .35, ypixels, -ypixels)

    # Centre the colour scale on zero so that the colormap's neutral midpoint
    # means "entries equal exits"; otherwise the scale spans min..max and the
    # red/blue split would not follow the sign of the net flow.
    color_limit = plot_data['netflow'].abs().max()
    if color_limit > 0:
        vmin, vmax = -color_limit, color_limit
    else:
        # No usable net-flow values (all zero, all NaN, or no stations):
        # fall back to matplotlib's automatic scaling.
        vmin = vmax = None

    plt.axes([0., 0., 1., .9], frameon=False, xticks=[], yticks=[])
    # plot background image with map
    plt.imshow(img, interpolation='none')
    plt.scatter(x=xpos, y=ypos, c=plot_data['netflow'], s=plot_data['busyness'] / 8,
                cmap='seismic', vmin=vmin, vmax=vmax, linewidths=0, alpha=0.7)
    plt.xlim(0, xpixels)
    plt.ylim(ypixels, 0)
    fig.suptitle(title, fontsize=16)

    # Proxy artists for the legend: two grey dots for marker size, one red and
    # one blue dot for the sign of the net flow.
    dot1 = lines.Line2D([0], [0], c='white', marker='o', mfc='gray', ms=4, mew=0)
    dot2 = lines.Line2D([0], [0], c='white', marker='o', mfc='gray', ms=8, mew=0)
    dot3 = lines.Line2D([0], [0], c='white', marker='o', mfc='red', ms=6, mew=0)
    dot4 = lines.Line2D([0], [0], c='white', marker='o', mfc='blue', ms=6, mew=0)
    plt.legend([dot1, dot2, dot3, dot4],
               ['Stations with less passengers',
                'Stations with more passengers',
                'Stations with more people entering',
                'Stations with more people exiting'],
               bbox_to_anchor=(0., 1.02, 1., .102), loc=3,
               ncol=2, mode="expand", borderaxespad=0., fontsize=9)
    return plt
# Draw one map per four-hour time group, in chronological order.
for group_label, plot_title in (
        ('0 - 4', 'NYC Subway Passengers Flow, 00:00 - 04:00'),
        ('4 - 8', 'NYC Subway Passengers Flow, 04:00 - 08:00'),
        ('8 - 12', 'NYC Subway Passengers Flow, 08:00 - 12:00'),
        ('12 - 16', 'NYC Subway Passengers Flow, 12:00 - 16:00'),
        ('16 - 20', 'NYC Subway Passengers Flow, 16:00 - 20:00'),
        ('20 - 24', 'NYC Subway Passengers Flow, 20:00 - 24:00'),
):
    spatial_plot(group_label, plot_title)